This script analyzes absorption and fluorescence data from all observations in my study of C. chamissoi.
Below you will see each step, explained in plain English, followed by the code that:
- Organizes data
- Creates diagnostic tables and plots
- Saves each output (plot or table) into a clear folder structure

Chondracanthus chamissoi
Phycoerythrin pigment

Scripts are sourced from the process_plate_run.R script and called upon in this Markdown.
Make sure that process_plate_run.R is saved in your working directory before running.
Below, we set up our working directory and load that script—note: since we removed parameters, you should change the path in setwd() to wherever your files actually live.

STEP 1: LOAD ALL INPUT EXCEL FILES

# 1) Folder that holds every Excel input workbook
input_folder <- "Input PE"

# 2) Full paths to all .xls/.xlsx files in that folder, minus Excel
#    lock/temp files (their base names begin with the literal prefix "~$")
excel_files <- list.files(input_folder, pattern = "\\.xlsx?$", full.names = TRUE)
excel_files <- excel_files[!startsWith(basename(excel_files), "~$")]

# Load every workbook found above into the global environment.
for (path in excel_files) {
  # Object name = file name without directory or extension
  obj_name <- tools::file_path_sans_ext(basename(path))

  # Prefer the second sheet when the workbook has more than one;
  # otherwise fall back to the first sheet
  sheets       <- excel_sheets(path)
  target_sheet <- if (length(sheets) > 1) sheets[2] else sheets[1]

  # Read quietly (read_excel is chatty about guessed column types)
  sheet_data <- suppressMessages(read_excel(path, sheet = target_sheet))

  # Publish the data frame under obj_name so later chunks can use it
  assign(obj_name, sheet_data, envir = .GlobalEnv)

  # Confirm each successful load in the knitted report
  message("Loaded: ", obj_name, " (sheet = '", target_sheet, "')")
}
## Loaded: Fluor1 (sheet = 'Sheet1')
## Loaded: Fluor2_2 (sheet = 'Sheet1')
## Loaded: Fluor3 (sheet = 'Flor2')
## Loaded: Fluor4 (sheet = 'Sheet1')
## Loaded: PE1 (sheet = '8_5_25 ENSAYO 1 TE FICOERETRINA')
## Loaded: pe1_weights_id (sheet = 'Sheet1')
## Loaded: PE2 (sheet = 'data')
## Loaded: pe2_weights_id (sheet = 'Sheet1')
## Loaded: PE3 (sheet = 'data')
## Loaded: pe3_weights_id (sheet = 'Sheet1')
## Loaded: PE4 (sheet = 'Sheet1')
## Loaded: pe4_weights_id (sheet = 'Fico')
## Loaded: Sample data (sheet = 'Sheet1')

All input Excel workbooks — four PE absorbance runs, four fluorescence runs, four sample-weight/ID sheets, and one sample-data sheet — are now loaded into R as data.frames named according to each file’s base name. Any temporary “~$…” files were skipped.

STEP 2: CONVERT RAW DATA INTO A TIDY FORMAT

# 1. Define which wells were used as blanks in each run
#    (run 1 had a single blank well; runs 2-4 each used three wells)
blanks1 <- "A01"
blanks2 <- c("A01", "A02", "A03")
blanks3 <- c("H09", "H10", "H11")
blanks4 <- c("G07", "G08", "G09")

# 2. Build named lists of raw datasets and their corresponding blank vectors.
#    All three lists are ordered run 1 -> run 4, so listBlanks[[i]] holds the
#    blank wells for both listPE[[i]] and listFluor[[i]].
listPE    <- list(PE1 = PE1, PE2 = PE2, PE3 = PE3, PE4 = PE4)
listFluor <- list(
  Fluor1 = Fluor1,
  Fluor2 = Fluor2_2,  # NOTE(review): run 2 loads as "Fluor2_2" — confirm this is the intended workbook
  Fluor3 = Fluor3,
  Fluor4 = Fluor4
)
listBlanks <- list(blanks1, blanks2, blanks3, blanks4)

# 3. Run the wrapper: it calls tidy_and_correct() on each dataset
#    (both functions are defined in process_plate_run.R, sourced earlier).
#    NOTE(review): the log below reports "*_final" objects saved, while the
#    narration and later chunks use "*_tidy" names — confirm which suffix
#    tidy_all() actually assigns.
tidy_all(listPE, listBlanks)  # Produces PE1_tidy, PE2_tidy, PE3_tidy, PE4_tidy
## Tidying PE1
## Correcting PE1 using blank(s): A01
## Saved PE1_final to global environment.
## Tidying PE2
## Correcting PE2 using blank(s): A01, A02, A03
## Saved PE2_final to global environment.
## Tidying PE3
## Correcting PE3 using blank(s): H09, H10, H11
## Saved PE3_final to global environment.
## Tidying PE4
## Correcting PE4 using blank(s): G07, G08, G09
## Saved PE4_final to global environment.
tidy_all(listFluor, listBlanks)  # Produces Fluor1_tidy, Fluor2_tidy, etc.
## Tidying Fluor1
## Correcting Fluor1 using blank(s): A01
## Saved Fluor1_final to global environment.
## Tidying Fluor2
## Correcting Fluor2 using blank(s): A01, A02, A03
## Saved Fluor2_final to global environment.
## Tidying Fluor3
## Correcting Fluor3 using blank(s): H09, H10, H11
## Saved Fluor3_final to global environment.
## Tidying Fluor4
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Correcting Fluor4 using blank(s): G07, G08, G09
## Saved Fluor4_final to global environment.
# NOTE(review): the "NAs introduced by coercion" warning above means Fluor4
# contains non-numeric cells that became NA — verify the raw workbook.
# 4. Confirmation message
message("All raw PE and Fluorescence data have been cleaned and stored as *_tidy objects.")
## All raw PE and Fluorescence data have been cleaned and stored as *_tidy objects.

All tidied up now!

After this step, you have eight cleaned data.frames:

PE1_tidy, PE2_tidy, PE3_tidy, PE4_tidy (each includes corrected absorbance at X565, X564, X455, X592, etc.)

Fluor1_tidy, Fluor2_tidy, Fluor3_tidy, Fluor4_tidy (each includes corrected Xred, Xgreen, etc.)

✅ 📊 Summary Statistics Table

STEP 3: REVIEW ABSORBANCE & FLUORESCENCE SUMMARY STATISTICS

# 1. Named list of summary() results for the key column of each run:
#    fluorescence runs are summarised on Xred, absorbance runs on X564.
summaries <- list(
  Fluor1 = summary(Fluor1_tidy$Xred),
  Fluor2 = summary(Fluor2_tidy$Xred),
  Fluor3 = summary(Fluor3_tidy$Xred),
  Fluor4 = summary(Fluor4_tidy$Xred),
  PE1    = summary(PE1_tidy$X564),
  PE2    = summary(PE2_tidy$X564),
  PE3    = summary(PE3_tidy$X564),
  PE4    = summary(PE4_tidy$X564)
)

# 2. Union of statistic names across runs ("Min.", "1st Qu.", ..., "NA's" —
#    the NA count only appears for runs that actually contain NAs).
all_stats <- unique(unlist(lapply(summaries, names)))

# 3. One column per run, one row per statistic. Runs lacking a statistic
#    (e.g. no "NA's" entry) get NA_real_. vapply() pins the return type to
#    numeric(1) — unlike sapply(), a malformed summary cannot silently
#    change the table's shape — and the explicit is.null() check removes
#    the dependency on `%||%` (rlang / base R >= 4.4).
summary_table <- data.frame(
  Statistic = all_stats,
  do.call(
    cbind,
    lapply(summaries, function(s) {
      s_named <- as.list(s)
      vapply(
        all_stats,
        function(k) if (is.null(s_named[[k]])) NA_real_ else as.numeric(s_named[[k]]),
        numeric(1)
      )
    })
  )
)

# 4. Re-label the run columns and round all numeric cells to two decimals
colnames(summary_table)[-1] <- names(summaries)
summary_table[, -1] <- round(as.data.frame(summary_table[, -1]), 2)
# 5. Print the table so it appears in the knitted HTML
print(summary_table)
##         Statistic   Fluor1   Fluor2   Fluor3   Fluor4  PE1  PE2   PE3   PE4
## Min.         Min.     0.00    -1.00    -5.67    -2.33 0.00 0.00 -0.01 -0.04
## 1st Qu.   1st Qu.  5425.00  2552.25    -3.67  1270.17 0.01 0.01  0.01  0.00
## Median     Median 11356.00  6205.50  5157.33  3645.67 0.01 0.01  0.01  0.00
## Mean         Mean 15257.96  8024.49 13187.94  6722.90 0.02 0.02  0.03  0.01
## 3rd Qu.   3rd Qu. 22447.25  9951.75 17618.83  9992.67 0.02 0.02  0.03  0.02
## Max.         Max. 47848.00 45258.00 84905.33 38028.67 0.60 0.16  0.31  0.12
## NA's         NA's       NA       NA     4.00     1.00   NA   NA    NA    NA
#
# 6. Persist the summary table as a CSV under
#    output PE/export data/summary_tables/
summary_dir <- file.path("output PE", "export data", "summary_tables")
summary_csv <- file.path(summary_dir, "PE_Fluor_summary_stats.csv")

if (!dir.exists(summary_dir)) {
  dir.create(summary_dir, recursive = TRUE)
}

write.csv(summary_table, file = summary_csv, row.names = FALSE)

message("Summary statistics saved to: ", summary_csv)
## Summary statistics saved to: output PE/export data/summary_tables/PE_Fluor_summary_stats.csv

Results (matching the summary table above):

Fluor1 (Xred): range 0 – 47,848, median ≈ 11,356

Fluor2 (Xred): range −1 – 45,258, median ≈ 6,206

Fluor3 (Xred): range −5.67 – 84,905, median ≈ 5,157 (4 NAs)

Fluor4 (Xred): range −2.33 – 38,029, median ≈ 3,646 (1 NA)

PE1 (X564): range 0.00 – 0.60, median ≈ 0.01

PE2 (X564): range 0.00 – 0.16, median ≈ 0.01

PE3 (X564): range −0.01 – 0.31, median ≈ 0.01

PE4 (X564): range −0.04 – 0.12, median ≈ 0.00

STEP 4: ABSORBANCE RATIO DIAGNOSTIC PLOT

# 1. Build `absorbance_ratio_df`: one row per well per ratio, with columns
#      dataset     - which run the well came from ("PE1" .. "PE4")
#      ratio_type  - "X455/X564" or "X592/X564"
#      ratio_value - the corresponding absorbance ratio for that well
#
#    The eight near-identical data.frame() calls of the original are
#    collapsed into a single loop over the four tidy PE data frames; the
#    row order (PE1 455-ratio, PE1 592-ratio, PE2 455-ratio, ...) is
#    unchanged.
pe_tidy_runs <- list(PE1 = PE1_tidy, PE2 = PE2_tidy, PE3 = PE3_tidy, PE4 = PE4_tidy)

absorbance_ratio_df <- bind_rows(
  lapply(names(pe_tidy_runs), function(run_name) {
    df <- pe_tidy_runs[[run_name]]
    bind_rows(
      data.frame(
        dataset     = run_name,
        ratio_type  = "X455/X564",
        ratio_value = df$X455 / df$X564
      ),
      data.frame(
        dataset     = run_name,
        ratio_type  = "X592/X564",
        ratio_value = df$X592 / df$X564
      )
    )
  })
)

# Jitter plot of both absorbance ratios, coloured by run; the dashed red
# line at 1.0 marks equal absorbance in numerator and denominator.
p_abs <- ggplot(
  absorbance_ratio_df,
  aes(x = ratio_type, y = ratio_value, color = dataset)
) +
  geom_jitter(alpha = 0.6, width = 0.1) +
  geom_hline(color = "red", linetype = "dashed", yintercept = 1.0) +
  theme_minimal() +
  labs(
    title = "Absorbance Ratios: X455/X564 and X592/X564",
    x     = "Absorbance Ratio Type",
    y     = "Ratio Value"
  )

# Write the static PNG under output PE/plots/absorbance/
absorbance_dir <- file.path("output PE", "plots", "absorbance")
if (!dir.exists(absorbance_dir)) {
  dir.create(absorbance_dir, recursive = TRUE)
}

ggsave(
  filename = file.path(absorbance_dir, "Absorbance_Ratio_X455_X592.png"),
  plot     = p_abs,
  width    = 10,
  height   = 6,
  dpi      = 300,
  bg       = "white"
)
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Echo the plot into the knitted HTML
print(p_abs)
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

The two panels show the distribution of absorbance ratios (X455/X564 on the left; X592/X564 on the right) for PE1–PE4 (colored points). A dashed horizontal line at 1.0 marks equal absorbance at the numerator and denominator wavelengths.

X455/X564 ratios span roughly 0.5 – 5.0 across all four datasets, with most values clustered between 1.0 and 3.0. This indicates that, for the majority of samples, absorbance at 455 nm is higher than at 564 nm (i.e., ratio > 1), although there are also some below 1.0.

X592/X564 ratios are much tighter, clustering mostly between 0.8 and 1.2 (nearly all below or near the dashed 1.0 line). In other words, absorbance at 592 nm tends to be equal to or lower than at 564 nm for nearly all PE1–PE4 samples.

STEP 5: JOIN THE DATA WITH ITS SAMPLE WEIGHTS SPREADSHEET

Description of joindf_by_id Function

The joindf_by_id function takes two data frames (df1 and df2) and merges them by matching unique identifiers related to samples, specifically using either a Cell_ID column or a plate well column.
Key steps and features: - Column Cleaning: Trims whitespace from column names in both data frames to avoid join errors caused by accidental spaces. - Key Column Verification: Checks that at least one data frame contains a Cell_ID column and the other contains a plate well column—these serve as the join keys. - Role Assignment: Depending on which data frame contains Cell_ID, that data frame is assigned as the base (df_cell), and the other becomes the joining data (df_plate). - Rename Join Keys: Renames both join columns to a common key name (join_id) to facilitate a straightforward left join. - Perform Join: Conducts a left join, keeping all rows from the base data frame and adding matching data from the other. - Identify Unmatched Rows: Any rows in the larger data frame without matches are saved separately for troubleshooting. - Output Files:
- Saves the merged data frame as a CSV named according to the provided output_name.
- Writes unmatched rows into a separate CSV file.
- Global Environment Assignment: Assigns the merged data frame into the global R environment under the same name as the output file (minus the .csv extension). - Reporting: Prints messages listing any unmatched identifiers and returns a summary report containing counts of matched/unmatched rows and file paths of saved CSVs.

# 1. Ensure the export subdirectory for the joined tables exists
save_dir <- file.path("output PE", "export data", "joined_weights_PE")
dir.create(save_dir, recursive = TRUE, showWarnings = FALSE)

# 2. Parallel lists, ordered run 1 -> run 4: the sample-weight sheets and
#    the tidy PE plate data they belong to
list_weights <- list(
  pe1_weights_id,
  pe2_weights_id,
  pe3_weights_id,
  pe4_weights_id
)

PE_list <- list(
  PE1 = PE1_tidy,
  PE2 = PE2_tidy,
  PE3 = PE3_tidy,
  PE4 = PE4_tidy
)

# 3. Join each PE run to its weights sheet via joindf_by_id().
#    Map() is mapply(..., SIMPLIFY = FALSE), so the returned (and
#    auto-printed) list of join reports is identical.
Map(
  function(df1, df2, name) {
    joindf_by_id(
      df1           = df1,
      df2           = df2,
      output_name   = file.path(save_dir, paste0(name, "_weights_joined.csv")),
      unmatched_out = file.path(save_dir, paste0(name, "_weights_unmatched.csv")),
      key_df1       = "Cell_ID",
      key_df2       = "plate well"
    )
  },
  df1  = PE_list,
  df2  = list_weights,
  name = names(PE_list)
)
## Join complete. Output saved to: output PE/export data/joined_weights_PE/PE1_weights_joined.csv
## Join complete. Output saved to: output PE/export data/joined_weights_PE/PE2_weights_joined.csv
## Join complete. Output saved to: output PE/export data/joined_weights_PE/PE3_weights_joined.csv
## Join complete. Output saved to: output PE/export data/joined_weights_PE/PE4_weights_joined.csv
## $PE1
## $PE1$result_df
## # A tibble: 96 × 11
##    join_id  X455  X564   X592   X618    X645   X565 ID      `sample weight`
##    <chr>   <dbl> <dbl>  <dbl>  <dbl>   <dbl>  <dbl> <chr>             <dbl>
##  1 A01     0     0     0      0      0       0      Blank01             0  
##  2 A02     0.027 0.018 0.012  0.013  0.01    0.017  Lab_01             10.4
##  3 A03     0.026 0.019 0.012  0.012  0.009   0.019  Lab_01             10.3
##  4 A04     0.018 0.012 0.008  0.009  0.007   0.012  Lab_01             11.1
##  5 A05     0.035 0.019 0.016  0.015  0.011   0.016  Lab_02             10.4
##  6 A06     0.012 0.007 0.0050 0.0050 0.00300 0.0050 Lab_02             10.2
##  7 A07     0.022 0.013 0.011  0.011  0.008   0.012  Lab_02             10.3
##  8 A08     0.012 0.009 0.006  0.006  0.00400 0.008  Lab_03             10.6
##  9 A09     0.026 0.012 0.007  0.008  0.00400 0.01   Lab_03             10.5
## 10 A10     0.059 0.052 0.043  0.04   0.035   0.047  Lab_03             10.1
## # ℹ 86 more rows
## # ℹ 2 more variables: `Tray well` <chr>, date <dttm>
## 
## $PE1$merged_cells
## [1] 96
## 
## $PE1$unmatched_cells
## [1] 0
## 
## $PE1$unmatched_saved_to
## [1] "output PE/export data/joined_weights_PE/PE1_weights_unmatched.csv"
## 
## $PE1$assigned_to_global
## [1] TRUE
## 
## 
## $PE2
## $PE2$result_df
## # A tibble: 96 × 9
##    join_id     X564     X592     X455     X565 ID    `sample weight` `Tray well`
##    <chr>      <dbl>    <dbl>    <dbl>    <dbl> <chr>           <dbl> <chr>      
##  1 A01      2.33e-3  4.33e-3  2.33e-3  1.67e-3 Blan…            NA   A1         
##  2 A02     -6.67e-4 -1.67e-3 -1.67e-3 -3.33e-4 Blan…            NA   A2         
##  3 A03     -1.67e-3 -2.67e-3 -6.67e-4 -1.33e-3 Blan…            NA   A3         
##  4 A04      1.33e-2  8.33e-3  1.23e-2  1.27e-2 Lab_…            24.5 A4         
##  5 A05      1.18e-1  1.30e-1  1.95e-1  1.60e-1 Lab_…            24.4 A5         
##  6 A06      1.13e-2  4.33e-3  1.33e-2  1.17e-2 Lab_…            24.1 A6         
##  7 A07      3.33e-3  3.33e-4  7.33e-3  3.67e-3 Lab_…            23.4 A7         
##  8 A08      3.33e-3 -6.67e-4  8.33e-3  3.67e-3 Lab_…            23.7 A8         
##  9 A09      1.33e-3 -1.67e-3  7.33e-3  1.67e-3 Lab_…            25.6 A9         
## 10 A10      1.13e-2  5.33e-3  1.83e-2  1.07e-2 Lab_…            23   A10        
## # ℹ 86 more rows
## # ℹ 1 more variable: date <dttm>
## 
## $PE2$merged_cells
## [1] 96
## 
## $PE2$unmatched_cells
## [1] 0
## 
## $PE2$unmatched_saved_to
## [1] "output PE/export data/joined_weights_PE/PE2_weights_unmatched.csv"
## 
## $PE2$assigned_to_global
## [1] TRUE
## 
## 
## $PE3
## $PE3$result_df
## # A tibble: 50 × 9
##    ID     `sample weight` `Tray well` join_id date                  X564   X565
##    <chr>            <dbl> <chr>       <chr>   <dttm>               <dbl>  <dbl>
##  1 Lab_40            26.8 G1          A01     2025-05-15 00:00:00 0.0193 0.0193
##  2 Lab_40            24.2 G2          A02     2025-05-15 00:00:00 0.0133 0.0133
##  3 Lab_40            26.7 G3          A03     2025-05-15 00:00:00 0.0223 0.0223
##  4 Lab_41            23.4 G4          A04     2025-05-15 00:00:00 0.0223 0.0223
##  5 Lab_41            25.6 G5          A05     2025-05-15 00:00:00 0.0283 0.0273
##  6 Lab_41            27   G6          A06     2025-05-15 00:00:00 0.0343 0.0333
##  7 Lab_42            25   G7          A07     2025-05-15 00:00:00 0.0183 0.0183
##  8 Lab_42            25.7 G8          A08     2025-05-15 00:00:00 0.0193 0.0193
##  9 Lab_42            24.6 G9          A09     2025-05-15 00:00:00 0.0223 0.0223
## 10 Lab_43            25.6 G10         A10     2025-05-15 00:00:00 0.0603 0.0593
## # ℹ 40 more rows
## # ℹ 2 more variables: X592 <dbl>, X455 <dbl>
## 
## $PE3$merged_cells
## [1] 50
## 
## $PE3$unmatched_cells
## [1] 46
## 
## $PE3$unmatched_saved_to
## [1] "output PE/export data/joined_weights_PE/PE3_weights_unmatched.csv"
## 
## $PE3$assigned_to_global
## [1] TRUE
## 
## 
## $PE4
## $PE4$result_df
## # A tibble: 81 × 11
##    ID     `sample weight` `tray well` join_id date        X564     X592     X455
##    <chr>            <dbl>       <dbl> <chr>   <chr>      <dbl>    <dbl>    <dbl>
##  1 Lab_01            23.6           1 A01     29/05/… -1.33e-3 -4.33e-3  4.67e-3
##  2 Lab_01            24             2 A02     29/05/… -3.33e-3 -5.33e-3 -1.33e-3
##  3 Lab_02            27.1           3 A03     29/05/… -3.33e-3 -4.33e-3  2.67e-3
##  4 Lab_02            23.2           4 A04     29/05/… -3.33e-3 -5.33e-3  7.67e-3
##  5 Lab_03            25.4           5 A05     29/05/…  6.67e-4 -4.33e-3  1.07e-2
##  6 Lab_03            25.2           6 A06     29/05/…  2.57e-2  7.67e-3  3.57e-2
##  7 Lab_04            26.2           7 A07     29/05/…  2.67e-3 -2.33e-3  1.27e-2
##  8 Lab_04            25.2           8 A08     29/05/…  1.07e-2  6.67e-4  1.17e-2
##  9 Lab_05            23.3           9 A09     29/05/… -1.33e-3 -4.33e-3  5.67e-3
## 10 Lab_05            23.3          10 A10     29/05/… -2.33e-3 -4.33e-3  6.67e-4
## # ℹ 71 more rows
## # ℹ 3 more variables: X618 <dbl>, X645 <dbl>, `X592[2]` <dbl>
## 
## $PE4$merged_cells
## [1] 81
## 
## $PE4$unmatched_cells
## [1] 15
## 
## $PE4$unmatched_saved_to
## [1] "output PE/export data/joined_weights_PE/PE4_weights_unmatched.csv"
## 
## $PE4$assigned_to_global
## [1] TRUE

Each PE#_weights_joined data frame has been processed to compute PE_mg_per_mL, total_PE_mg, and PE_mg_per_g_sample.

Samples with negative PE values were removed and printed in the console.

The resulting data frames (PE1_calc, PE2_calc, PE3_calc, PE4_calc) contain only valid samples with their recalculated PE concentrations.

STEP 6: CALCULATE PE CONTENT (purified phycoerythrin)

This function takes a tidy data frame containing absorbance measurements and calculates the concentration of phycoerythrin (PE) for each sample based on the Beer & Eshel (1985) method.

It performs the following steps:

  1. PE Calculation:
    Calculates the estimated PE content (in µg and mg) of each observation using the Beer & Eshel (1985) formula: \[ PE_{\mu g/g} = \left((A_{564} - A_{592}) - 0.20 \times (A_{455} - A_{592})\right) \times 0.12 \]

from Beer S, Eshel A (1985) Determining phycoerythrin and phycocyanin concentrations in aqueous crude extracts of red algae. Aust J Mar Freshwater Res 36:785–792, https://doi.org/10.1071/MF9850785.

  2. Filtering Negative PE Values:
    Samples with negative PE concentrations are identified and removed. These removed samples are printed in the console with the reason for removal.

  3. Normalization to Sample Weight:
    The PE concentration is converted from micrograms per milliliter (µg/mL) to milligrams per gram of dry sample (mg/g) by adjusting for the extract volume and the individual sample weights provided in the data frame. This ensures accurate PE quantification based on the exact weight of each sample rather than using a fixed default weight.

  4. Output and Metadata:
    The function returns a filtered data frame containing valid samples with their calculated PE concentrations in mg/g. Additionally, it stores the filtered-out rows (samples with negative PE) as an attribute called "removed_rows_pe" for further inspection if needed.

This process helps ensure the quality and accuracy of PE measurements by excluding invalid data and normalizing concentrations based on actual sample weights.

# Export subdirectory for the filtered PE results
pe_filtered_dir <- file.path("output PE", "export data", "pe filtered")
dir.create(pe_filtered_dir, recursive = TRUE, showWarnings = FALSE)

# The four weight-joined PE tables produced in Step 5
joined_data <- list(
  PE1 = PE1_weights_joined,
  PE2 = PE2_weights_joined,
  PE3 = PE3_weights_joined,
  PE4 = PE4_weights_joined
)

# Compute PE per run and drop negative values. invisible() suppresses the
# returned list so only the function's own console messages reach the report.
invisible(
  Map(
    function(df, name) {
      calculate_pe_and_filter(
        tidy_df           = df,
        output_basename   = paste0(name, "_calc"),
        sample_id_col     = "join_id",
        sample_weight_col = "sample weight"  # exact column name in the joined data
      )
    },
    df   = joined_data,
    name = names(joined_data)
  )
)
## Removed observations due to negative PE values:
## # A tibble: 16 × 3
##    join_id PE_mg_per_mL Removal_Reason        
##    <chr>          <dbl> <chr>                 
##  1 A05        -9.60e- 5 removed because PE < 0
##  2 A07        -2.40e- 5 removed because PE < 0
##  3 A11        -7.20e- 5 removed because PE < 0
##  4 B01        -2.40e- 5 removed because PE < 0
##  5 B05        -9.84e- 4 removed because PE < 0
##  6 D10        -1.44e- 4 removed because PE < 0
##  7 E08        -2.40e- 5 removed because PE < 0
##  8 E09        -6.77e-19 removed because PE < 0
##  9 F08        -4.80e- 5 removed because PE < 0
## 10 G02        -2.40e- 5 removed because PE < 0
## 11 G03        -7.20e- 5 removed because PE < 0
## 12 G04        -6.77e-19 removed because PE < 0
## 13 G05        -9.60e- 5 removed because PE < 0
## 14 G07        -2.64e- 4 removed because PE < 0
## 15 G09        -4.80e- 5 removed because PE < 0
## 16 H12        -4.80e- 5 removed because PE < 0
## Filtered data saved to: output PE/export data/pe filtered/PE1_calc.csv
## Removed observations due to negative PE values:
## # A tibble: 4 × 3
##   join_id PE_mg_per_mL Removal_Reason        
##   <chr>          <dbl> <chr>                 
## 1 A01        -0.000192 removed because PE < 0
## 2 A05        -0.00300  removed because PE < 0
## 3 E04        -0.00036  removed because PE < 0
## 4 F01        -0.00156  removed because PE < 0
## Filtered data saved to: output PE/export data/pe filtered/PE2_calc.csv
## Removed observations due to negative PE values:
## # A tibble: 1 × 3
##   join_id PE_mg_per_mL Removal_Reason        
##   <chr>          <dbl> <chr>                 
## 1 H11        -0.000136 removed because PE < 0
## Filtered data saved to: output PE/export data/pe filtered/PE3_calc.csv
## Removed observations due to negative PE values:
## # A tibble: 7 × 3
##   join_id PE_mg_per_mL Removal_Reason        
##   <chr>          <dbl> <chr>                 
## 1 A03       -0.0000480 removed because PE < 0
## 2 A04       -0.0000720 removed because PE < 0
## 3 C06       -0.0000240 removed because PE < 0
## 4 D11       -0.0000720 removed because PE < 0
## 5 D12       -0.0000240 removed because PE < 0
## 6 G08       -0.0000720 removed because PE < 0
## 7 G09       -0.0000240 removed because PE < 0
## Filtered data saved to: output PE/export data/pe filtered/PE4_calc.csv

STEP 7: FLUORESCENCE DATA VISUALIZATION

# Output directory for the per-run fluorescence bar charts
plot_dir <- file.path("output PE", "plots", "fluorescence_xred")
dir.create(plot_dir, recursive = TRUE, showWarnings = FALSE)

# The four cleaned fluorescence runs, keyed by run name
fluor_list <- list(
  Fluor1 = Fluor1_tidy,
  Fluor2 = Fluor2_tidy,
  Fluor3 = Fluor3_tidy,
  Fluor4 = Fluor4_tidy
)

# For each run: build a labelled bar chart of raw Xred per well, save a
# static PNG, and keep an interactive plotly copy for the HTML report.
plots <- lapply(names(fluor_list), function(name) {
  dat <- fluor_list[[name]]

  p <- ggplot(dat, aes(x = Cell_ID, y = Xred)) +
    geom_bar(stat = "identity", fill = "skyblue") +
    geom_text(aes(label = round(Xred, 3)), vjust = -0.5, size = 3) +
    theme_minimal() +
    theme(
      axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)
    ) +
    labs(
      title = paste("Raw Xred Fluorescence –", name),
      x     = "Sample (Cell_ID)",
      y     = "Xred Fluorescence"
    )

  # Static PNG for the output folder
  ggsave(
    filename = file.path(plot_dir, paste0(name, "_fluorescence.png")),
    plot     = p,
    width    = 10,
    height   = 6,
    dpi      = 300,
    bg       = "white"
  )

  # Interactive version for the knitted HTML
  ggplotly(p)
})
names(plots) <- names(fluor_list)

# Display all interactive plots in the R Markdown HTML output
library(htmltools)
tagList(plots)

Each Fluorescence run’s bar chart shows the raw Xred value for every sample (Cell_ID).

Static PNG files (Fluor1_fluorescence.png, Fluor2_fluorescence.png, etc.) have been saved under output PE/plots/fluorescence_xred/.

Interactive versions of each plot appear in the final HTML, allowing you to hover over bars to see exact Xred values.

STEP 8: FLUORESCENCE DATA JOIN

# Create subdirectory for this join group
pe_fluor_dir <- file.path("output PE", "export data", "pe_fluor_joins")
dir.create(pe_fluor_dir, recursive = TRUE, showWarnings = FALSE)

# Run joins with explicit output paths.
# Join run 1: filtered PE concentrations (PE1_calc) matched to fluorescence
# wells (Fluor1_tidy) on join_id -> Cell_ID; joindf_by_id() writes the merged
# and unmatched CSVs and also assigns the result to the global environment.
joindf_by_id(
  df1 = PE1_calc,
  df2 = Fluor1_tidy,
  output_name   = file.path(pe_fluor_dir, "pe_fluor1_joined.csv"),
  unmatched_out = file.path(pe_fluor_dir, "pe_fluor1_unmatched.csv"),
  key_df1 = "join_id",
  key_df2 = "Cell_ID"
)
## Join complete. Output saved to: output PE/export data/pe_fluor_joins/pe_fluor1_joined.csv
## $result_df
## # A tibble: 80 × 15
##    join_id  X455  X564   X592   X618    X645   X565 ID      `sample weight`
##    <chr>   <dbl> <dbl>  <dbl>  <dbl>   <dbl>  <dbl> <chr>             <dbl>
##  1 A01     0     0     0      0      0       0      Blank01             0  
##  2 A02     0.027 0.018 0.012  0.013  0.01    0.017  Lab_01             10.4
##  3 A03     0.026 0.019 0.012  0.012  0.009   0.019  Lab_01             10.3
##  4 A04     0.018 0.012 0.008  0.009  0.007   0.012  Lab_01             11.1
##  5 A06     0.012 0.007 0.0050 0.0050 0.00300 0.0050 Lab_02             10.2
##  6 A08     0.012 0.009 0.006  0.006  0.00400 0.008  Lab_03             10.6
##  7 A09     0.026 0.012 0.007  0.008  0.00400 0.01   Lab_03             10.5
##  8 A10     0.059 0.052 0.043  0.04   0.035   0.047  Lab_03             10.1
##  9 A12     0.014 0.007 0.0050 0.0050 0.00300 0.007  Lab_04             10.6
## 10 B02     0.015 0.008 0.0050 0.0050 0.00400 0.008  Lab_05             10.9
## # ℹ 70 more rows
## # ℹ 6 more variables: `Tray well` <chr>, date <dttm>, PE_mg_per_mL <dbl>,
## #   total_PE_mg <dbl>, PE_mg_per_g_sample <dbl>, Xred <dbl>
## 
## $merged_cells
## [1] 80
## 
## $unmatched_cells
## [1] 16
## 
## $unmatched_saved_to
## [1] "output PE/export data/pe_fluor_joins/pe_fluor1_unmatched.csv"
## 
## $assigned_to_global
## [1] TRUE
# Join run 2: PE2_calc matched to Fluor2_tidy on join_id -> Cell_ID
joindf_by_id(
  df1 = PE2_calc,
  df2 = Fluor2_tidy,
  output_name   = file.path(pe_fluor_dir, "pe_fluor2_joined.csv"),
  unmatched_out = file.path(pe_fluor_dir, "pe_fluor2_unmatched.csv"),
  key_df1 = "join_id",
  key_df2 = "Cell_ID"
)
## Join complete. Output saved to: output PE/export data/pe_fluor_joins/pe_fluor2_joined.csv
## $result_df
## # A tibble: 92 × 13
##    join_id     X564     X592     X455     X565 ID    `sample weight` `Tray well`
##    <chr>      <dbl>    <dbl>    <dbl>    <dbl> <chr>           <dbl> <chr>      
##  1 A02     -6.67e-4 -1.67e-3 -1.67e-3 -3.33e-4 Blan…            NA   A2         
##  2 A03     -1.67e-3 -2.67e-3 -6.67e-4 -1.33e-3 Blan…            NA   A3         
##  3 A04      1.33e-2  8.33e-3  1.23e-2  1.27e-2 Lab_…            24.5 A4         
##  4 A06      1.13e-2  4.33e-3  1.33e-2  1.17e-2 Lab_…            24.1 A6         
##  5 A07      3.33e-3  3.33e-4  7.33e-3  3.67e-3 Lab_…            23.4 A7         
##  6 A08      3.33e-3 -6.67e-4  8.33e-3  3.67e-3 Lab_…            23.7 A8         
##  7 A09      1.33e-3 -1.67e-3  7.33e-3  1.67e-3 Lab_…            25.6 A9         
##  8 A10      1.13e-2  5.33e-3  1.83e-2  1.07e-2 Lab_…            23   A10        
##  9 A11      7.33e-3  1.33e-3  1.23e-2  7.67e-3 Lab_…            23.6 A11        
## 10 A12      8.33e-3  1.33e-3  8.33e-3  8.67e-3 Lab_…            26.9 A12        
## # ℹ 82 more rows
## # ℹ 5 more variables: date <dttm>, PE_mg_per_mL <dbl>, total_PE_mg <dbl>,
## #   PE_mg_per_g_sample <dbl>, Xred <dbl>
## 
## $merged_cells
## [1] 92
## 
## $unmatched_cells
## [1] 4
## 
## $unmatched_saved_to
## [1] "output PE/export data/pe_fluor_joins/pe_fluor2_unmatched.csv"
## 
## $assigned_to_global
## [1] TRUE
# Join run 3: PE3_calc matched to Fluor3_tidy on join_id -> Cell_ID.
# NOTE(review): the knitted output reports 47 unmatched wells for this run —
# verify PE3's plate layout / well IDs against the fluorescence sheet.
joindf_by_id(
  df1 = PE3_calc,
  df2 = Fluor3_tidy,
  output_name   = file.path(pe_fluor_dir, "pe_fluor3_joined.csv"),
  unmatched_out = file.path(pe_fluor_dir, "pe_fluor3_unmatched.csv"),
  key_df1 = "join_id",
  key_df2 = "Cell_ID"
)
## Join complete. Output saved to: output PE/export data/pe_fluor_joins/pe_fluor3_joined.csv
## $result_df
## # A tibble: 49 × 13
##    ID     `sample weight` `Tray well` join_id date                  X564   X565
##    <chr>            <dbl> <chr>       <chr>   <dttm>               <dbl>  <dbl>
##  1 Lab_40            26.8 G1          A01     2025-05-15 00:00:00 0.0193 0.0193
##  2 Lab_40            24.2 G2          A02     2025-05-15 00:00:00 0.0133 0.0133
##  3 Lab_40            26.7 G3          A03     2025-05-15 00:00:00 0.0223 0.0223
##  4 Lab_41            23.4 G4          A04     2025-05-15 00:00:00 0.0223 0.0223
##  5 Lab_41            25.6 G5          A05     2025-05-15 00:00:00 0.0283 0.0273
##  6 Lab_41            27   G6          A06     2025-05-15 00:00:00 0.0343 0.0333
##  7 Lab_42            25   G7          A07     2025-05-15 00:00:00 0.0183 0.0183
##  8 Lab_42            25.7 G8          A08     2025-05-15 00:00:00 0.0193 0.0193
##  9 Lab_42            24.6 G9          A09     2025-05-15 00:00:00 0.0223 0.0223
## 10 Lab_43            25.6 G10         A10     2025-05-15 00:00:00 0.0603 0.0593
## # ℹ 39 more rows
## # ℹ 6 more variables: X592 <dbl>, X455 <dbl>, PE_mg_per_mL <dbl>,
## #   total_PE_mg <dbl>, PE_mg_per_g_sample <dbl>, Xred <dbl>
## 
## $merged_cells
## [1] 49
## 
## $unmatched_cells
## [1] 47
## 
## $unmatched_saved_to
## [1] "output PE/export data/pe_fluor_joins/pe_fluor3_unmatched.csv"
## 
## $assigned_to_global
## [1] TRUE
# Join run 4: PE4_calc matched to Fluor4_tidy on join_id -> Cell_ID.
# NOTE(review): 22 wells are reported unmatched for this run — check them in
# pe_fluor4_unmatched.csv.
joindf_by_id(
  df1 = PE4_calc,
  df2 = Fluor4_tidy,
  output_name   = file.path(pe_fluor_dir, "pe_fluor4_joined.csv"),
  unmatched_out = file.path(pe_fluor_dir, "pe_fluor4_unmatched.csv"),
  key_df1 = "join_id",
  key_df2 = "Cell_ID"
)
## Join complete. Output saved to: output PE/export data/pe_fluor_joins/pe_fluor4_joined.csv
## $result_df
## # A tibble: 74 × 15
##    ID     `sample weight` `tray well` join_id date        X564     X592     X455
##    <chr>            <dbl>       <dbl> <chr>   <chr>      <dbl>    <dbl>    <dbl>
##  1 Lab_01            23.6           1 A01     29/05/… -1.33e-3 -4.33e-3  4.67e-3
##  2 Lab_01            24             2 A02     29/05/… -3.33e-3 -5.33e-3 -1.33e-3
##  3 Lab_03            25.4           5 A05     29/05/…  6.67e-4 -4.33e-3  1.07e-2
##  4 Lab_03            25.2           6 A06     29/05/…  2.57e-2  7.67e-3  3.57e-2
##  5 Lab_04            26.2           7 A07     29/05/…  2.67e-3 -2.33e-3  1.27e-2
##  6 Lab_04            25.2           8 A08     29/05/…  1.07e-2  6.67e-4  1.17e-2
##  7 Lab_05            23.3           9 A09     29/05/… -1.33e-3 -4.33e-3  5.67e-3
##  8 Lab_05            23.3          10 A10     29/05/… -2.33e-3 -4.33e-3  6.67e-4
##  9 Lab_12            26.3          11 A11     29/05/…  2.67e-3 -1.33e-3  1.57e-2
## 10 Lab_12            26.5          12 A12     29/05/…  2.67e-3 -2.33e-3  1.37e-2
## # ℹ 64 more rows
## # ℹ 7 more variables: X618 <dbl>, X645 <dbl>, `X592[2]` <dbl>,
## #   PE_mg_per_mL <dbl>, total_PE_mg <dbl>, PE_mg_per_g_sample <dbl>, Xred <dbl>
## 
## $merged_cells
## [1] 74
## 
## $unmatched_cells
## [1] 22
## 
## $unmatched_saved_to
## [1] "output PE/export data/pe_fluor_joins/pe_fluor4_unmatched.csv"
## 
## $assigned_to_global
## [1] TRUE
# Leftover one-off rename kept for reference (was already commented out)
#colnames(pe_fluor4)[colnames(pe_fluor4_joined) == "sample ID"] <- "ID"

# Gather the four PE + fluorescence joins produced in Step 8
pe_fluor_all <- list(pe_fluor1_joined, pe_fluor2_joined, pe_fluor3_joined, pe_fluor4_joined)

# Tag every row with its source run ("run1" .. "run4") as a factor
pe_fluor_all <- Map(function(df, i) {
  df$run <- factor(paste0("run", i))
  df
}, pe_fluor_all, seq_along(pe_fluor_all))

# Columns shared by all four runs — only these can be row-bound safely
common_cols <- Reduce(intersect, lapply(pe_fluor_all, names))

# Harmonize date columns: any CHARACTER column whose name contains "date"
# is parsed to Date. The runs store dates in either m/d/Y or d/m/Y form, so
# try the first format and fall back to the second when nothing parses.
# (The original matched columns by identity — O(n^2) and an error under
# R >= 4.3 if two columns are identical — and used tryCatch() as the
# fallback trigger; but as.Date() returns NA rather than erroring on a
# non-matching format, so the d/m/Y branch could never run and those dates
# were silently lost as NA.)
pe_fluor_all <- lapply(pe_fluor_all, function(df) {
  date_cols <- grepl("date", names(df), ignore.case = TRUE)
  df[date_cols] <- lapply(df[date_cols], function(col) {
    if (!is.character(col)) {
      return(col)  # already Date/POSIXct — leave untouched
    }
    parsed <- as.Date(col, format = "%m/%d/%Y")
    if (all(is.na(parsed[!is.na(col)]))) {
      parsed <- as.Date(col, format = "%d/%m/%Y")
    }
    parsed
  })
  df
})

# Bind the runs together, keeping only the shared columns. The per-run
# `run` factor added above already identifies each source run, so no `.id`
# column is used: `run` is in common_cols, and `.id = "run"` would collide
# with that existing column.
combined_df <- bind_rows(lapply(pe_fluor_all, function(df) df[common_cols]))

# Scale PE up (mg/g -> per-1000 magnitude) for regression stability
combined_df <- combined_df %>%
  mutate(PE_scaled = PE_mg_per_g_sample * 1000)

Each run’s PE + fluorescence data was merged and saved (pe_fluor1_joined, …, pe_fluor4_joined).

combined_df now contains all four runs, only columns common to every run, a run factor, fixed date parsing, and a new column PE_scaled ready for regression analyses.

STEP 10: FULL DATASET PLOT WITH POINTS COLORED BY RUN

  • We want to visualize all data points (Xred vs. PE_mg_per_g_sample) across every run in a single scatterplot.
  • Each point will be colored according to its run factor, so we can see how the different runs overlap or differ.
  • We will hard‐code the x‐axis limit from 0 to 0.5 in order to zoom in on the main cluster of values, while still retaining any outliers outside that range.
  • Finally, we will save this combined plot as Xred_PE_all_runs.png and display it in the HTML.
# Scatter Xred against PE content for every run, one color per run,
# zoomed to the 0-0.5 mg/g window where most observations fall.
pe_fluor_plot_dir <- file.path("output PE", "plots", "pe_fluor")
dir.create(pe_fluor_plot_dir, recursive = TRUE, showWarnings = FALSE)

# Build the combined scatterplot
p_all <- ggplot(combined_df, aes(x = PE_mg_per_g_sample, y = Xred, color = run)) +
  geom_point(alpha = 0.6) +
  scale_color_brewer(palette = "Dark2") +
  coord_cartesian(xlim = c(0, 0.5)) +
  labs(
    title = "Xred vs. PE_mg_per_g_sample (All Runs)",
    x     = "PE (mg/g sample)",
    y     = "Xred Fluorescence",
    color = "Run"
  ) +
  theme_minimal()

# Write the figure into the pe_fluor plot folder
ggsave(
  filename = file.path(pe_fluor_plot_dir, "Xred_vs_PE_all_runs.png"),
  plot     = p_all,
  width    = 10,
  height   = 6,
  dpi      = 300,
  bg       = "white"
)
## Warning: Removed 10 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Display the plot in the knitted HTML. The "Removed 10 rows" warning
# presumably comes from NA values in Xred/PE -- TODO confirm against the data.
print(p_all)
## Warning: Removed 10 rows containing missing values or values outside the scale range
## (`geom_point()`).

The scatterplot shows how Xred (fluorescence) increases with PE_mg_per_g_sample (phycoerythrin content), with each run distinguished by color.

Most points from every run lie between 0 – 0.5 mg/g on the x‐axis, confirming that the hard‐coded limit captures the bulk of the data.

Outliers beyond PE = 0.5 mg/g are clipped from view but still exist in the dataset.

STEP 11: CLEAN & ADJUST FACTOR LEVELS FOR RUNS

Plain English:

  • We first drop every row whose PE_mg_per_g_sample is missing (fluorescence was recorded, but no phycoerythrin could be quantified).
  • We then add a new factor level "fresh run 3 & 4" to the run column.
  • Finally, we assign "fresh run 3 & 4" to every sample whose ID contains "fresh" (case-insensitive).
# Drop rows that have fluorescence data but no quantified phycoerythrin.
combined_df <- combined_df[!is.na(combined_df$PE_mg_per_g_sample), ]



# 1. Add a new level "fresh run 3 & 4" for the run grouping.
# NOTE(review): 'run' prints as <chr> downstream (see the tibble output at the
# prediction step), so this levels()<- call is likely a no-op on a character
# vector; the direct assignment below works either way -- confirm.
levels(combined_df$run) <- c(levels(combined_df$run), "fresh run 3 & 4")

# 2. Relabel fresh-tissue samples (IDs containing "fresh") into that group.
combined_df$run[grepl("fresh", combined_df$ID, ignore.case = TRUE)] <- "fresh run 3 & 4"

# 3. Export the cleaned, relabeled dataset (NA-PE rows were removed above).
combined_export_path <- file.path("output PE", "export data", "combined_df.csv")
write.csv(combined_df, combined_export_path, row.names = FALSE)

A new level “fresh run 3 & 4” has been added and assigned to the fresh-tissue samples.

Fluorescence phycoerythrin estimation fitting

# Restrict to the run-2 calibration data: drop rows missing either variable,
# then discard trace-level PE below 0.01 mg/g (i.e. below 10 µg/g).
combined_df_run2 <- combined_df %>%
  filter(run == "2") %>%
  filter(!is.na(PE_mg_per_g_sample), !is.na(Xred)) %>%
  filter(PE_mg_per_g_sample >= 0.01)
## STEP 11: REGRESSION MODELS BY RUN
# First-pass OLS calibration; points whose standardized residual magnitude
# reaches 2 are tagged as statistical outliers.
model_initial_run2 <- lm(PE_mg_per_g_sample ~ Xred, data = combined_df_run2)

std_res_run2 <- rstandard(model_initial_run2)
combined_df_run2 <- combined_df_run2 %>%
  mutate(
    resid_initial = residuals(model_initial_run2),
    std_resid     = std_res_run2,
    is_outlier    = abs(std_res_run2) >= 2
  )

# Report the sample size and the outlier share
combined_df_run2 %>%
  summarize(
    total_points = n(),
    n_outliers   = sum(is_outlier),
    pct_outliers = 100 * mean(is_outlier)
  ) %>%
  print()
## # A tibble: 1 × 3
##   total_points n_outliers pct_outliers
##          <int>      <int>        <dbl>
## 1           81          4         4.94
# Visualize the initial model with flagged outliers highlighted in red.
# FIX: the `size` aesthetic for lines is deprecated since ggplot2 3.4.0 and
# emitted a warning here -- geom_abline now uses `linewidth` instead.
ggplot(combined_df_run2, aes(x = Xred, y = PE_mg_per_g_sample, color = is_outlier)) +
  geom_point(size = 2, alpha = 0.7) +
  scale_color_manual(values = c("FALSE" = "steelblue", "TRUE" = "firebrick")) +
  geom_abline(
    slope     = coef(model_initial_run2)["Xred"],
    intercept = coef(model_initial_run2)["(Intercept)"],
    color     = "darkgreen", linewidth = 1
  ) +
  labs(
    title    = "Run 2: Initial PE vs. Xred (Red = |Std Resid| ≥ 2)",
    x        = "Xred (raw fluorescence)",
    y        = "PE (mg per g sample)"
  ) +
  theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Refit the calibration after excluding the flagged outliers.
combined_df_run2_clean <- filter(combined_df_run2, !is_outlier)

model_clean_run2 <- lm(PE_mg_per_g_sample ~ Xred, data = combined_df_run2_clean)

# Coefficient table for the cleaned model
print(tidy(model_clean_run2))
## # A tibble: 2 × 5
##   term          estimate   std.error statistic  p.value
##   <chr>            <dbl>       <dbl>     <dbl>    <dbl>
## 1 (Intercept) 0.00268    0.00158          1.69 9.52e- 2
## 2 Xred        0.00000527 0.000000154     34.3  1.49e-47
summary(model_clean_run2)$r.squared %>% print()
## [1] 0.9399393
# Visualize the final fit on run 2 (outliers removed)
ggplot(combined_df_run2_clean, aes(x = Xred, y = PE_mg_per_g_sample)) +
  geom_point(color = "steelblue", size = 2, alpha = 0.7) +
  geom_smooth(method = "lm", color = "darkgreen", se = FALSE) +
  labs(
    title = "Run 2: Final Calibration (Outliers Removed)",
    x     = "Xred (raw fluorescence)",
    y     = "PE (mg per g sample)"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# ── Define a function to add fluorescence‐predicted PE to any dataframe ──────────────────

# Extract intercept and slope of the cleaned run-2 calibration (PE ~ Xred)
coef_intercept <- coef(model_clean_run2)["(Intercept)"]
coef_slope     <- coef(model_clean_run2)["Xred"]

#' calc_PE_from_Xred
#'
#' Given a dataframe containing a column of fluorescence values (Xred),
#' adds a new column with predicted PE (mg/g) based on the cleaned Run 2 calibration.
#' NOTE(review): the function body is not defined in this document -- it is
#' presumably provided by the sourced process_plate_run.R; confirm before
#' running this chunk standalone.
#'
#' @param df        A data frame.
#' @param fluor_col String: name of the column in df containing the fluorescence (Xred) values.
#' @param new_col   String: desired name for the new predicted PE column (default = "PE_predicted_mg_per_g").
#' @return          A new data frame with an additional column `new_col`.

# Apply the calibration-based prediction to the master dataset
combined_df <- calc_PE_from_Xred(combined_df, fluor_col = "Xred", new_col = "PE_pred_run2_mg_per_g")

# Spot-check: first 10 rows of measured vs. calibration-predicted PE
combined_df %>%
  select(join_id, run, Xred, PE_mg_per_g_sample, PE_pred_run2_mg_per_g) %>%
  head(10) %>%
  print()
## # A tibble: 10 × 5
##    join_id run    Xred PE_mg_per_g_sample PE_pred_run2_mg_per_g
##    <chr>   <chr> <dbl>              <dbl>                 <dbl>
##  1 A02     1     16642            0.0519                 0.0904
##  2 A03     1     24823            0.0734                 0.134 
##  3 A04     1     15650            0.0324                 0.0852
##  4 A06     1      2119            0.0106                 0.0138
##  5 A08     1     10796            0.0306                 0.0596
##  6 A09     1     21089            0.0206                 0.114 
##  7 A10     1     32338            0.103                  0.173 
##  8 A12     1      7922            0.00340                0.0444
##  9 B02     1      6026            0.0165                 0.0344
## 10 B03     1      3957            0.00947                0.0235
# ── (Optional) Function to also add standard‐error-of‐fit using the clean Run 2 model ────────

#' add_PE_with_se
#'
#' Given a dataframe containing a column of fluorescence values (Xred),
#' adds two new columns:
#'   1) Predicted PE (mg/g) based on the cleaned Run 2 model
#'   2) Standard error of the fitted mean (SE) for each prediction
#' NOTE(review): like calc_PE_from_Xred, the body is defined in the sourced
#' process_plate_run.R, not in this document -- confirm it is loaded.
#'
#' @param df        A data frame.
#' @param fluor_col String: name of the column in df containing the fluorescence (Xred) values.
#' @param pred_col  String: name of the new predicted PE column.
#' @param se_col    String: name of the new standard‐error column.
#' @return          A new data frame with additional columns `pred_col` and `se_col`.

# Apply to the master dataset (overwrites PE_pred_run2_mg_per_g with the
# same prediction and adds the per-row SE column)
combined_df <- add_PE_with_se(combined_df,
                              fluor_col = "Xred",
                              pred_col  = "PE_pred_run2_mg_per_g",
                              se_col    = "PE_se_run2")

# Spot-check: first 10 rows now including the per-row prediction SE
combined_df %>%
  select(join_id, run, Xred, PE_mg_per_g_sample,
         PE_pred_run2_mg_per_g, PE_se_run2) %>%
  head(10) %>%
  print()
## # A tibble: 10 × 6
##    join_id run    Xred PE_mg_per_g_sample PE_pred_run2_mg_per_g PE_se_run2
##    <chr>   <chr> <dbl>              <dbl>                 <dbl>      <dbl>
##  1 A02     1     16642            0.0519                 0.0904    0.00168
##  2 A03     1     24823            0.0734                 0.134     0.00279
##  3 A04     1     15650            0.0324                 0.0852    0.00156
##  4 A06     1      2119            0.0106                 0.0138    0.00135
##  5 A08     1     10796            0.0306                 0.0596    0.00110
##  6 A09     1     21089            0.0206                 0.114     0.00226
##  7 A10     1     32338            0.103                  0.173     0.00389
##  8 A12     1      7922            0.00340                0.0444    0.00101
##  9 B02     1      6026            0.0165                 0.0344    0.00105
## 10 B03     1      3957            0.00947                0.0235    0.00118
# Compute differences between measured and predicted PE.
de <- "output PE/plots"
if (!dir.exists(de)) dir.create(de, recursive = TRUE)
df_compare <- combined_df %>%
  filter(!is.na(PE_mg_per_g_sample), !is.na(PE_pred_run2_mg_per_g)) %>%
  mutate(
    diff     = PE_mg_per_g_sample - PE_pred_run2_mg_per_g,
    diff_abs = abs(diff),
    mean_PE  = (PE_mg_per_g_sample + PE_pred_run2_mg_per_g) / 2
  )
# NOTE(review): this filter drops NA but not non-finite values; at least one
# sample (G07) carries Inf in PE_mg_per_g_sample, so `diff` can be Inf here.

# 1) Scatter: Measured vs Predicted PE, labeled by well and colored by run
p1 <- ggplot(df_compare, aes(x = PE_pred_run2_mg_per_g, y = PE_mg_per_g_sample, color = run)) +
  geom_point(size = 2) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
  geom_text_repel(aes(label = join_id), size = 3, max.overlaps = 20) +
  labs(
    title = "Measured vs. Predicted PE by Run",
    x = "Predicted PE (mg/g)",
    y = "Measured PE (mg/g)",
    color = "Run"
  ) +
  theme_minimal()
print(p1)

# Save plot.
# FIX: reuse the `de` directory defined above instead of repeating the
# literal path, consistent with the other ggsave() calls in this section.
ggsave(filename = file.path(de, "measured_vs_predicted_PE.png"), plot = p1, width = 6, height = 4)

# 2) Residuals vs Predicted.
# FIX: removed the stray `color = "Run"` label -- this plot maps no color
# aesthetic, so the label was dead code.
p2 <- ggplot(df_compare, aes(x = PE_pred_run2_mg_per_g, y = diff)) +
  geom_point(size = 2) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  labs(
    title = "Residuals vs. Predicted",
    x = "Predicted PE (mg/g)",
    y = "Measured – Predicted (mg/g)"
  ) +
  theme_minimal()

print(p2)

# Save plot
ggsave(filename = file.path(de, "residuals_vs_predicted_PE.png"), plot = p2, width = 6, height = 4)

# 3) Bland–Altman plot.
# FIX: `diff` can contain Inf (sample G07 has Inf measured PE), which made
# mean()/sd() non-finite and silently suppressed the bias and
# limit-of-agreement lines. Compute the statistics on finite diffs only.
ba_diff <- df_compare$diff[is.finite(df_compare$diff)]
ba_bias <- mean(ba_diff)           # mean bias (measured - predicted)
ba_loa  <- 1.96 * sd(ba_diff)      # half-width of the 95% limits of agreement
p3 <- ggplot(df_compare, aes(x = mean_PE, y = diff)) +
  geom_point(color = "darkgreen", size = 2) +
  geom_hline(yintercept = ba_bias, color = "blue") +
  geom_hline(yintercept = ba_bias + ba_loa, linetype = "dashed", color = "grey50") +
  geom_hline(yintercept = ba_bias - ba_loa, linetype = "dashed", color = "grey50") +
  labs(
    title = "Bland–Altman Plot",
    x = "Mean of Measured & Predicted PE (mg/g)",
    y = "Difference (Measured – Predicted)"
  ) +
  theme_minimal()
print(p3)

# Save plot
ggsave(filename = file.path(de, "bland_altman_PE.png"), plot = p3, width = 6, height = 4)
# Rank samples by absolute measured-vs-predicted discrepancy; keep the worst 20.
top20_diff <- df_compare %>%
  arrange(desc(diff_abs)) %>%
  head(20) %>%
  select(join_id, run, PE_mg_per_g_sample, PE_pred_run2_mg_per_g, diff, diff_abs)

# Echo the table to the console
print(top20_diff)
## # A tibble: 20 × 6
##    join_id run   PE_mg_per_g_sample PE_pred_run2_mg_per_g     diff diff_abs
##    <chr>   <chr>              <dbl>                 <dbl>    <dbl>    <dbl>
##  1 G07     4              Inf                     0.00268 Inf      Inf     
##  2 G06     4                0.539                 0.203     0.336    0.336 
##  3 B09     1                0.00686               0.224    -0.217    0.217 
##  4 B07     1                0.238                 0.0417    0.196    0.196 
##  5 G04     4                0.338                 0.159     0.180    0.180 
##  6 E01     3                0.498                 0.333     0.165    0.165 
##  7 D02     1                0.11                  0.255    -0.145    0.145 
##  8 C12     3                0.590                 0.450     0.140    0.140 
##  9 C03     1                0.129                 0.239    -0.109    0.109 
## 10 G03     4                0.224                 0.119     0.106    0.106 
## 11 H06     2                0.344                 0.241     0.103    0.103 
## 12 D03     1                0.101                 0.202    -0.101    0.101 
## 13 F01     1                0.127                 0.225    -0.0973   0.0973
## 14 G12     1                0.0675                0.164    -0.0961   0.0961
## 15 F03     1                0.127                 0.223    -0.0956   0.0956
## 16 F06     1                0.0928                0.187    -0.0940   0.0940
## 17 A09     1                0.0206                0.114    -0.0933   0.0933
## 18 F05     1                0.107                 0.198    -0.0912   0.0912
## 19 F04     1                0.130                 0.217    -0.0872   0.0872
## 20 H02     1                0.0648                0.147    -0.0822   0.0822
# Make sure the data-export folder exists, then write out the worst-20 table.
export_dir <- "output PE/export data"
if (!dir.exists(export_dir)) {
  dir.create(export_dir, recursive = TRUE)
}

write_csv(top20_diff, file.path(export_dir, "top20_PE_differences.csv"))

For each unique run in combined_df, we will:
1. Filter to that run’s subset of data.
2. If the run has fewer than 4 samples, skip with a warning.
3. Fit three regression models:
- Linear:
\[ X_{\text{red}} \sim PE\_scaled \]
- Log:
\[ X_{\text{red}} \sim \log\bigl(PE\_scaled + 0.001\bigr) \]
- Polynomial:
\[ X_{\text{red}} \sim PE\_scaled \;+\; I\bigl(PE\_scaled^2\bigr) \]
4. Extract R² and p‐values for each model using get_stats().
5. Build a ggplot that:
- Plots raw points (geom_point).
- Adds fitted lines:
- Blue = linear model
- Green (dashed) = log model
- Red (dot‐dash) = polynomial model
- Annotates each line’s R²/p‐value in the top‐right corner.
6. Save each run’s plot under plots/regression/<run>/Xred_PE_regressions_<run>.png.
7. Print each plot so it appears in the knitted HTML.

# Coerce run to a factor and prepare the shared folder for regression plots.
combined_df$run <- as.factor(combined_df$run)

reg_dir <- file.path("output PE", "plots", "pe_fluor_regressions")
dir.create(reg_dir, recursive = TRUE, showWarnings = FALSE)

# 3. Loop over each unique run identifier
# For each run: fit linear, log, and quadratic calibrations of Xred on
# PE_scaled, annotate their fit statistics, and save/print one plot per run.
for (run_id in unique(as.character(combined_df$run))) {
  # Subset to this run and keep only finite, usable observations
  run_data <- combined_df %>%
    filter(run == run_id) %>%
    filter(
      !is.na(PE_scaled), !is.na(Xred),
      is.finite(PE_scaled), is.finite(Xred),
      PE_scaled > -0.001
    )

  # Too few points to fit three models sensibly
  if (nrow(run_data) < 4) {
    warning(paste("Skipping run", run_id, "- not enough data (n =", nrow(run_data), ")"))
    next
  }

  # Candidate models (the +0.001 offset keeps log() defined at PE_scaled = 0)
  model_linear <- lm(Xred ~ PE_scaled, data = run_data)
  model_log    <- lm(Xred ~ log(PE_scaled + 0.001), data = run_data)
  model_poly   <- lm(Xred ~ PE_scaled + I(PE_scaled^2), data = run_data)

  # R² / p-value annotation strings (get_stats comes from process_plate_run.R)
  ann_linear <- get_stats(model_linear, "Linear")
  ann_log    <- get_stats(model_log,    "Log")
  ann_poly   <- get_stats(model_poly,   "Poly")

  # Scatter with all three fitted curves overlaid and annotated top-right
  p <- ggplot(run_data, aes(x = PE_scaled, y = Xred)) +
    geom_point(alpha = 0.6, color = "black") +
    stat_smooth(method = "lm", formula = y ~ x,
                se = FALSE, color = "blue") +
    stat_smooth(method = "lm", formula = y ~ log(x + 0.001),
                se = FALSE, color = "green", linetype = "dashed") +
    stat_smooth(method = "lm", formula = y ~ x + I(x^2),
                se = FALSE, color = "red", linetype = "dotdash") +
    annotate("text", x = Inf, y = Inf, label = ann_linear,
             hjust = 1.05, vjust = 2, color = "blue", size = 4) +
    annotate("text", x = Inf, y = Inf, label = ann_log,
             hjust = 1.05, vjust = 3.5, color = "green", size = 4) +
    annotate("text", x = Inf, y = Inf, label = ann_poly,
             hjust = 1.05, vjust = 5, color = "red", size = 4) +
    theme_minimal() +
    labs(
      title = paste("Regression Models: Xred vs PE (", run_id, ")"),
      x     = "PE_scaled (mg/g sample × 1000)",
      y     = "Xred Fluorescence"
    )

  # Save to the shared folder (one file per run) and render in the HTML
  ggsave(
    filename = file.path(reg_dir, paste0("Xred_PE_regressions_", run_id, ".png")),
    plot     = p,
    width    = 10,
    height   = 6,
    dpi      = 300,
    bg       = "white"
  )
  print(p)
}

STEP 12: ANALYZE TECHNICAL REPLICATES

  • We need to process replicate measurements for each sample to calculate summary statistics (mean, standard deviation, standard error, coefficient of variation, etc.).
  • The analyze_replicates() function will:
    1. Group data by a sample identifier (id_col) and perform calculations on numeric columns (e.g., PE_mg_per_g_sample, Xred).
    2. Optionally select the best 3 replicates (when choose_best_3 = TRUE) by minimizing the coefficient of variation (CV) before calculating summary stats.
    3. Compute per‐sample means, SDs, SEs, CVs, and maximum deviation percentages for each variable.
    4. Record the number of replicates, average weights, and list which replicates were included/excluded.
    5. Write a CSV file (<output_prefix>_summary.csv) with one row per sample, containing all summary metrics.
    6. Return a summary data frame in the global environment (named <output_prefix>_summary).

Below we run two passes of analyze_replicates() on combined_df: 1. Without enhanced CV‐based selection (choose_best_3 = FALSE), saving as all_rep_analy_summary.csv.
2. With enhanced selection (choose_best_3 = TRUE), saving as E_rep_analy_summary.csv.
Finally, we use graph_replicates_custom_error() to plot histograms with error bars for key variables (PE_mg_per_g_sample and Xred).

# 1. Analyze replicates without enhanced (best-3) selection -- every
#    replicate contributes to the per-sample summary statistics.
analyze_replicates(
  data          = combined_df,
  id_col        = "ID",         # column that uniquely identifies each sample
  join_col      = "join_id",         # well-level key used for replicate grouping
  weight_col    = "sample weight",   # column containing sample weight in grams
  date_col      = "date",            # column containing sample collection date
  output_prefix = "all_rep_analy",   # prefix for output files (will produce all_rep_analy_summary.csv)
  choose_best_3 = FALSE, # do not filter replicates, use all
  dir = "output PE/export data/"
)
## Summary saved to: output PE/export data/all_rep_analy_summary.csv
## # A tibble: 59 × 65
##    ID       PE_mg_per_g_sample_mean PE_mg_per_mL_mean PE_pred_run2_mg_per_g_mean
##    <fct>                      <dbl>             <dbl>                      <dbl>
##  1 BLANK_01                Inf              0.0000960                    0.00268
##  2 Lab_01                    0.0351         0.000360                     0.0558 
##  3 Lab_02                    0.0120         0.000168                     0.0100 
##  4 Lab_03                    0.0438         0.000540                     0.0687 
##  5 Lab_04                    0.0254         0.000412                     0.0295 
##  6 Lab_05                    0.0199         0.000279                     0.0251 
##  7 Lab_06                    0.0972         0.00101                      0.0475 
##  8 Lab_07                    0.0548         0.000596                     0.101  
##  9 Lab_08                    0.0321         0.000440                     0.0462 
## 10 Lab_09                    0.0693         0.0006                       0.101  
## # ℹ 49 more rows
## # ℹ 61 more variables: PE_scaled_mean <dbl>, PE_se_run2_mean <dbl>,
## #   X455_mean <dbl>, X564_mean <dbl>, X592_mean <dbl>, Xred_mean <dbl>,
## #   total_PE_mg_mean <dbl>, PE_mg_per_g_sample_sd <dbl>, PE_mg_per_mL_sd <dbl>,
## #   PE_pred_run2_mg_per_g_sd <dbl>, PE_scaled_sd <dbl>, PE_se_run2_sd <dbl>,
## #   X455_sd <dbl>, X564_sd <dbl>, X592_sd <dbl>, Xred_sd <dbl>,
## #   total_PE_mg_sd <dbl>, PE_mg_per_g_sample_se <dbl>, PE_mg_per_mL_se <dbl>, …
# 2. Analyze replicates with enhanced (best-3) selection by CV
analyze_replicates(
  data          = combined_df,
  id_col        = "ID",              # column uniquely identifying each sample
  join_col      = "join_id",         # join key for replicate grouping
  weight_col    = "sample weight",   # sample weight column (grams)
  date_col      = "date",            # collection date column
  output_prefix = "E_rep_analy",     # prefix for output files (will produce E_rep_analy_summary.csv)
  choose_best_3 = TRUE,
  dir = "output PE/export data/"
  # select the best 3 replicates (lowest CV) per sample
)
## Summary saved to: output PE/export data/E_rep_analy_summary.csv
## # A tibble: 59 × 85
##    ID       PE_mg_per_g_sample_mean PE_mg_per_mL_mean PE_pred_run2_mg_per_g_mean
##    <fct>                      <dbl>             <dbl>                      <dbl>
##  1 BLANK_01                Inf              0.0000960                    0.00268
##  2 Lab_01                    0.0340         0.000544                     0.103  
##  3 Lab_02                    0.0104         0.000200                     0.0111 
##  4 Lab_03                    0.0287         0.000608                     0.0255 
##  5 Lab_04                    0.0114         0.000192                     0.0449 
##  6 Lab_05                    0.0163         0.000120                     0.0294 
##  7 Lab_06                    0.0755         0.00142                      0.0441 
##  8 Lab_07                    0.0843         0.000728                     0.166  
##  9 Lab_08                    0.0441         0.000744                     0.0448 
## 10 Lab_09                    0.112          0.000760                     0.0271 
## # ℹ 49 more rows
## # ℹ 81 more variables: PE_scaled_mean <dbl>, PE_se_run2_mean <dbl>,
## #   X455_mean <dbl>, X564_mean <dbl>, X592_mean <dbl>, Xred_mean <dbl>,
## #   total_PE_mg_mean <dbl>, PE_mg_per_g_sample_sd <dbl>, PE_mg_per_mL_sd <dbl>,
## #   PE_pred_run2_mg_per_g_sd <dbl>, PE_scaled_sd <dbl>, PE_se_run2_sd <dbl>,
## #   X455_sd <dbl>, X564_sd <dbl>, X592_sd <dbl>, Xred_sd <dbl>,
## #   total_PE_mg_sd <dbl>, PE_mg_per_g_sample_se <dbl>, PE_mg_per_mL_se <dbl>, …
# 3. Generate histograms with error bars for key variables
## SAVE HISTOGRAMS WITH ERROR BARS AS PNGS
# Create output folder if it doesn't exist

STEP 12.5: GENERATE HISTOGRAMS WITH ERROR BARS FOR MULTIPLE VARIABLES

  • We want to create a bar plot (histogram) with error bars for each variable in a user‐specified list (e.g., "PE_mg_per_g_sample", "Xred").
  • For each variable, we assume there is a corresponding standard error column named "<variable>_se" (e.g., "PE_mg_per_g_sample_se", "Xred_se").
  • We will call graph_replicates_custom_error() once for each pair (value_col, se_col).
  • The function will create its own output folder named "<variable>_replicate_analysis_plots" (based on output_prefix) and save a PNG of the bar plot with error bars.
  • Finally, we collect the returned interactive Plotly objects in a list, so they can be displayed in the HTML if desired.
# Variables to summarize; each needs matching "<var>_mean" / "<var>_se"
# columns in the replicate summary.
variables <- c("PE_mg_per_g_sample", "Xred", "X564")

# Shared destination for the replicate-analysis plots
rep_plot_dir <- file.path("output PE", "plots", "replicate_analysis")
dir.create(rep_plot_dir, recursive = TRUE, showWarnings = FALSE)

# Load the enhanced replicate summary produced above
E_rep_analy_summary <- read.csv("output PE/export data/E_rep_analy_summary.csv")

# Collect the interactive (plotly) versions for display in the HTML
interactive_plots <- list()

# One bar plot with error bars per variable; the helper creates its own
# "<prefix>_plots" folder and saves a PNG inside it.
for (var in variables) {
  interactive_plots[[var]] <- graph_replicates_custom_error(
    data          = E_rep_analy_summary,
    id_col        = "ID",
    value_col     = paste0(var, "_mean"),
    se_col        = paste0(var, "_se"),
    output_prefix = file.path(rep_plot_dir, paste0(var, "_replicate_analysis"))
  )
}
## Saving 7 x 5 in image
## Plot saved to: output PE/plots/replicate_analysis/PE_mg_per_g_sample_replicate_analysis_plots/PE_mg_per_g_sample_mean_replicates.png
## 
## Saving 7 x 5 in image
## Plot saved to: output PE/plots/replicate_analysis/Xred_replicate_analysis_plots/Xred_mean_replicates.png
## 
## Saving 7 x 5 in image
## Plot saved to: output PE/plots/replicate_analysis/X564_replicate_analysis_plots/X564_mean_replicates.png
# 6. Render all collected interactive plotly widgets in the knitted HTML
htmltools::tagList(interactive_plots)

Check the effect of Lorena’s data (run 4)

# Re-run the enhanced replicate analysis excluding run 4, so its effect on
# the per-sample standard errors can be quantified below.
Combined_df_before <- combined_df[combined_df$run != "4",]
analyze_replicates(
  data          = Combined_df_before,
  id_col        = "ID",              # column uniquely identifying each sample
  join_col      = "join_id",         # join key for replicate grouping
  weight_col    = "sample weight",   # sample weight column (grams)
  date_col      = "date",            # collection date column
  output_prefix = "E_rep_analy_before",  # produces E_rep_analy_before_summary.csv
  choose_best_3 = TRUE,
  dir = "output PE/export data/"
  )
## Warning in max(abs(df_best$value - mean_val)/mean_val * 100, na.rm = TRUE): no
## non-missing arguments to max; returning -Inf
## Warning in max(abs(df_best$value - mean_val)/mean_val * 100, na.rm = TRUE): no
## non-missing arguments to max; returning -Inf
## Warning in max(abs(df_best$value - mean_val)/mean_val * 100, na.rm = TRUE): no
## non-missing arguments to max; returning -Inf
## Warning in max(abs(df_best$value - mean_val)/mean_val * 100, na.rm = TRUE): no
## non-missing arguments to max; returning -Inf
## Warning in max(abs(df_best$value - mean_val)/mean_val * 100, na.rm = TRUE): no
## non-missing arguments to max; returning -Inf
## Warning in max(abs(df_best$value - mean_val)/mean_val * 100, na.rm = TRUE): no
## non-missing arguments to max; returning -Inf
## Summary saved to: output PE/export data/E_rep_analy_before_summary.csv
## # A tibble: 48 × 85
##    ID     PE_mg_per_g_sample_mean PE_mg_per_mL_mean PE_pred_run2_mg_per_g_mean
##    <fct>                    <dbl>             <dbl>                      <dbl>
##  1 Lab_01                 0.0340          0.000544                      0.103 
##  2 Lab_02                 0.0104          0.000200                      0.0111
##  3 Lab_03                 0.0287          0.000608                      0.0271
##  4 Lab_04                 0.00791         0.000120                      0.0356
##  5 Lab_05                 0.0163          0.0000960                     0.0294
##  6 Lab_06                 0.0755          0.00142                       0.0441
##  7 Lab_07                 0.0843          0.000728                      0.166 
##  8 Lab_08                 0.0441          0.000744                      0.0448
##  9 Lab_09                 0.112           0.000760                      0.0271
## 10 Lab_10                 0.0448          0.000728                      0.0570
## # ℹ 38 more rows
## # ℹ 81 more variables: PE_scaled_mean <dbl>, PE_se_run2_mean <dbl>,
## #   X455_mean <dbl>, X564_mean <dbl>, X592_mean <dbl>, Xred_mean <dbl>,
## #   total_PE_mg_mean <dbl>, PE_mg_per_g_sample_sd <dbl>, PE_mg_per_mL_sd <dbl>,
## #   PE_pred_run2_mg_per_g_sd <dbl>, PE_scaled_sd <dbl>, PE_se_run2_sd <dbl>,
## #   X455_sd <dbl>, X564_sd <dbl>, X592_sd <dbl>, Xred_sd <dbl>,
## #   total_PE_mg_sd <dbl>, PE_mg_per_g_sample_se <dbl>, PE_mg_per_mL_se <dbl>, …
# Reload the run-4-free summary written above.
Before <- read.csv("output PE/export data/E_rep_analy_before_summary.csv")

# Drop the first row (BLANK_01, whose PE mean is Inf) so the comparison
# covers real samples only.
# NOTE(review): assumes BLANK_01 is always the first row -- confirm.
E_rep_analy_summary <- E_rep_analy_summary[-1, ]

# Mean change in per-sample SE after adding run 4 (negative = SE shrank).
# FIX: spell out TRUE and na.rm consistently instead of the reassignable
# shorthand T; split the expression across lines for readability.
se_change <- mean(E_rep_analy_summary$PE_mg_per_g_sample_se, na.rm = TRUE) -
  mean(Before$PE_mg_per_g_sample_se, na.rm = TRUE)

paste("sample se for replicates change by:", se_change)
## [1] "sample se for replicates change by: -0.00382526703367693"

Check effects of enhancement algorithm

# Compare maximum deviation (as a percent of the mean) per replicate group,
# with versus without the enhanced CV-based replicate selection.
PErep_enhanced <- read_csv("output PE/export data/E_rep_analy_summary.csv") # retrieve analyzed replicates from enhanced CV selection
## Rows: 59 Columns: 85
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (3): ID, join_ids, replicate_date
## dbl (62): PE_mg_per_g_sample_mean, PE_mg_per_mL_mean, PE_pred_run2_mg_per_g_...
## num (20): PE_mg_per_g_sample_included_rows, PE_mg_per_mL_included_rows, PE_p...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
PErep_all <- read_csv("output PE/export data/all_rep_analy_summary.csv") # retrieve analyzed replicates without enhancement
## Rows: 59 Columns: 65
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (3): ID, join_ids, replicate_date
## dbl (62): PE_mg_per_g_sample_mean, PE_mg_per_mL_mean, PE_pred_run2_mg_per_g_...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Note: PErep_enhanced is a product of analyze_replicates using enhanced replicate selection.
# Element-wise change in max deviation; negative values mean the enhancement improved agreement.
SE_change <- PErep_enhanced$PE_mg_per_g_sample_max_dev_pct - PErep_all$PE_mg_per_g_sample_max_dev_pct
print(SE_change)
##  [1]         NaN  -95.108168  -19.902274 -128.706311 -108.064228 -185.850844
##  [7] -108.785120  -58.363908  -36.616304  -69.382077  -75.855386  -53.173964
## [13] -243.116436  -74.728616  -97.153013  -27.285558 -248.700651 -140.641449
## [19] -140.486124  -53.367967  -69.692891  -41.812381  -56.191109  -25.362519
## [25]  -61.400035  -43.591519  -51.338031  -54.883972 -115.956778  -44.245268
## [31] -124.140828  -72.129501  -70.399559  -82.781859  -88.693591  -90.270432
## [37]  -57.064369 -147.851170 -104.640528  -15.885761  -13.205278  -21.560248
## [43]    0.000000    0.000000    0.000000    0.000000    0.000000    0.000000
## [49]    0.000000    0.000000  -60.386760    0.000000    0.000000   -7.568183
## [55] -170.931322  -57.869010  -46.732619    0.000000    0.000000
# Remove the undefined entry (the BLANK group yields NaN).
# BUG FIX: the original code ran `SE_change[-51]` with the comment "remove one
# NaN at the end", but the NaN sits at position 1 (see printed output above),
# so a real data point (-60.386760) was discarded and the NaN retained.
# Filtering on is.nan() removes exactly the undefined entries.
SE_change <- SE_change[!is.nan(SE_change)]
paste("# number of replicates SE acted upon:", length(SE_change[SE_change != 0]))
## [1] "# number of replicates SE acted upon: 46"
# NOTE: the value printed below is from the original (pre-fix) run; retaining
# -60.386760 will shift the average slightly on the next knit.
paste("average improvement by max deviation as a percent of the mean:", abs(mean(SE_change[SE_change != 0], na.rm = TRUE)))
## [1] "average improvement by max deviation as a percent of the mean: 82.2552702172294"

STEP 13: JOIN ENHANCED REPLICATE ANALYSIS WITH SAMPLE METADATA AND RUN GROUP COMPARISONS

Plain English:

  • We want to merge the enhanced replicate‐analysis summary (PErep_enhanced) with our sample metadata (Sample data) so that we can run group‐level comparisons (e.g., by location, variety, life stage).
  • After joining:
    1. We generate histograms with custom error bars for PE_mg_per_g_sample_mean using graph_replicates_custom_error().
    2. We run compare_groups() repeatedly to produce boxplots (with significance letters), comparing both PE_mg_per_g_sample_mean and PE_pred_run2_mg_per_g_mean:
      • by Location (C. chamissoi only)
      • by Life_S (life stage, C. chamissoi only)
      • by variety (Paracas/Marcona sites)
      • by Location across preparation methods, and by Location for the gametophyte/tetrasporophyte subset
  • Each plot is saved automatically by the respective function (if they write files), and printed to the HTML.
# 1. Define the final export folder and make sure it exists.
final_export_dir <- file.path("output PE", "export data", "Samples Analysis Final")
dir.create(final_export_dir, recursive = TRUE, showWarnings = FALSE)

# 2. Merge the enhanced replicate summary with the sample metadata on "ID".
#    joindf_by_id() writes the merged table and the unmatched rows to disk and
#    assigns the merged data frame (`PErep_final`) to the global environment.
joined_path    <- file.path(final_export_dir, "PErep_final.csv")
unmatched_path <- file.path(final_export_dir, "PErep_unmatched.csv")
joindf_by_id(
  df1           = PErep_enhanced,
  df2           = `Sample data`,
  output_name   = joined_path,
  unmatched_out = unmatched_path,
  key_df1       = "ID",
  key_df2       = "ID"
)
## Join complete. Output saved to: output PE/export data/Samples Analysis Final/PErep_final.csv
## $result_df
## # A tibble: 59 × 92
##    join_id  PE_mg_per_g_sample_mean PE_mg_per_mL_mean PE_pred_run2_mg_per_g_mean
##    <chr>                      <dbl>             <dbl>                      <dbl>
##  1 BLANK_01                Inf              0.0000960                    0.00268
##  2 Lab_01                    0.0340         0.000544                     0.103  
##  3 Lab_02                    0.0104         0.0002                       0.0111 
##  4 Lab_03                    0.0287         0.000608                     0.0255 
##  5 Lab_04                    0.0114         0.000192                     0.0449 
##  6 Lab_05                    0.0163         0.00012                      0.0294 
##  7 Lab_06                    0.0755         0.00142                      0.0441 
##  8 Lab_07                    0.0843         0.000728                     0.166  
##  9 Lab_08                    0.0441         0.000744                     0.0448 
## 10 Lab_09                    0.112          0.00076                      0.0271 
## # ℹ 49 more rows
## # ℹ 88 more variables: PE_scaled_mean <dbl>, PE_se_run2_mean <dbl>,
## #   X455_mean <dbl>, X564_mean <dbl>, X592_mean <dbl>, Xred_mean <dbl>,
## #   total_PE_mg_mean <dbl>, PE_mg_per_g_sample_sd <dbl>, PE_mg_per_mL_sd <dbl>,
## #   PE_pred_run2_mg_per_g_sd <dbl>, PE_scaled_sd <dbl>, PE_se_run2_sd <dbl>,
## #   X455_sd <dbl>, X564_sd <dbl>, X592_sd <dbl>, Xred_sd <dbl>,
## #   total_PE_mg_sd <dbl>, PE_mg_per_g_sample_se <dbl>, PE_mg_per_mL_se <dbl>, …
## 
## $merged_cells
## [1] 57
## 
## $unmatched_cells
## [1] 11
## 
## $unmatched_saved_to
## [1] "output PE/export data/Samples Analysis Final/PErep_unmatched.csv"
## 
## $assigned_to_global
## [1] TRUE
# 3. Read the joined summary back in for plotting and group comparisons.
PErep_final <- readr::read_csv(file.path(final_export_dir, "PErep_final.csv"))
## Rows: 59 Columns: 92
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): join_id, join_ids, replicate_date, Sample Code, Location, region, ...
## dbl (82): PE_mg_per_g_sample_mean, PE_mg_per_mL_mean, PE_pred_run2_mg_per_g_...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# 4. Folder that will receive the replicate-analysis plots.
rep_plot_dir <- file.path("output PE", "plots", "replicate_analysis")
dir.create(rep_plot_dir, recursive = TRUE, showWarnings = FALSE)

# 5. Histogram of PE_mg_per_g_sample_mean with custom SE error bars.
graph_replicates_custom_error(
  data          = PErep_final,
  id_col        = "join_id",
  value_col     = "PE_mg_per_g_sample_mean",
  se_col        = "PE_mg_per_g_sample_se",
  output_prefix = file.path(rep_plot_dir, "E_rep_analy")
)
## Saving 7 x 5 in image
## Plot saved to: output PE/plots/replicate_analysis/E_rep_analy_plots/PE_mg_per_g_sample_mean_replicates.png
# Field/market samples only: exclude the preparation-method comparison groups.
# NOTE(review): "Lima Market Oven Dry" is NOT excluded here although it is
# treated as a preparation-method group below — confirm this is intended.
prep_method_locations <- c("Lima Market Freeze Dry", "Ilo Freeze Dry",
                           "Ilo oven dry", "Ilo Fresh", "Lima Market Fresh")
PE_location <- PErep_final %>%
  filter(!Location %in% prep_method_locations)

# C. chamissoi only (exclude the F.Glom variety).
# NOTE(review): this filters PErep_final, not PE_location, so the
# preparation-method locations remain included — confirm that is intended.
PE_location_cham <- PErep_final %>%
  filter(!variety %in% c("F.Glom"))
# 6. Run group comparisons; compare_groups() saves a boxplot with significance
#    letters into its subfolder and prints the statistics to the report.

## ---- Location (C. chamissoi only) ----
compare_groups(
  data           = PE_location_cham,
  response_var   = "PE_mg_per_g_sample_mean",
  group_var      = "Location",
  subfolder_name = "PE_Location_cham_A"
)
## 
## Attaching package: 'ggpubr'
## 
## The following objects are masked from 'package:datawizard':
## 
##     mean_sd, median_mad
## 
## For one-way between subjects designs, partial eta squared is equivalent
##   to eta squared. Returning eta squared.

compare_groups(
  data           = PE_location_cham,
  response_var   = "PE_pred_run2_mg_per_g_mean",
  group_var      = "Location",
  subfolder_name = "PE_Location_cham_B"
)
## For one-way between subjects designs, partial eta squared is equivalent
##   to eta squared. Returning eta squared.

## ---- Life stage (Life_S, C. chamissoi only) ----
compare_groups(
  data           = PE_location_cham,
  response_var   = "PE_mg_per_g_sample_mean",
  group_var      = "Life_S",
  subfolder_name = "PE_Life_S_cham_A"
)
## For one-way between subjects designs, partial eta squared is equivalent
##   to eta squared. Returning eta squared.

compare_groups(
  data           = PE_location_cham,
  response_var   = "PE_pred_run2_mg_per_g_mean",
  group_var      = "Life_S",
  subfolder_name = "PE_Life_S_cham_B"
)
## For one-way between subjects designs, partial eta squared is equivalent
##   to eta squared. Returning eta squared.

########################################### Variety
# Restrict to the Paracas/Marcona sites where both varieties were collected,
# so the by-variety comparison has members in each group.
PE_paracas_marcona <- PErep_final %>%
  filter(Location %in% c("Mendieta", "7H", "Caro Caido"))

compare_groups(
  data           = PE_paracas_marcona,
  response_var   = "PE_mg_per_g_sample_mean",
  group_var      = "variety",
  subfolder_name = "PE_variety_A"
)

# BUG FIX: this call previously passed PE_location_cham, which excludes the
# F.Glom variety entirely and so cannot support a by-variety comparison;
# use the same Paracas/Marcona subset as the A plot above.
compare_groups(
  data           = PE_paracas_marcona,
  response_var   = "PE_pred_run2_mg_per_g_mean",
  group_var      = "variety",
  subfolder_name = "PE_variety_B"
)

################################# Preparation method
# Keep only the samples used to compare drying/processing methods.
method_locations <- c("Lima Market Freeze Dry", "Ilo Freeze Dry",
                      "Ilo oven dry", "Ilo Fresh", "Lima Market Fresh",
                      "Lima Market Oven Dry")
PE_methods <- PErep_final %>%
  filter(Location %in% method_locations)

compare_groups(
  data           = PE_methods,
  response_var   = "PE_mg_per_g_sample_mean",
  group_var      = "Location",
  subfolder_name = "PE_method_A"
)
## For one-way between subjects designs, partial eta squared is equivalent
##   to eta squared. Returning eta squared.

compare_groups(
  data           = PE_methods,
  response_var   = "PE_pred_run2_mg_per_g_mean",
  group_var      = "Location",
  subfolder_name = "PE_method_B"
)
## For one-way between subjects designs, partial eta squared is equivalent
##   to eta squared. Returning eta squared.

################################# Gam Cham
# Gametophyte / tetrasporophyte subset of the field locations.
PE_gamtetra <- PE_location %>%
  filter(Life_S %in% c("Gam/Tetra", "Gam", "Tetra"))

compare_groups(
  data           = PE_gamtetra,
  response_var   = "PE_mg_per_g_sample_mean",
  group_var      = "Location",
  subfolder_name = "PE_gamtetra_location_A"
)
## For one-way between subjects designs, partial eta squared is equivalent
##   to eta squared. Returning eta squared.

# BUG FIX: this second call previously reused subfolder "PE_gamtetra_location_A",
# silently overwriting the A-plot outputs; the run-2 prediction results now go
# to their own "_B" subfolder, matching every other A/B pair in this script.
compare_groups(
  data           = PE_gamtetra,
  response_var   = "PE_pred_run2_mg_per_g_mean",
  group_var      = "Location",
  subfolder_name = "PE_gamtetra_location_B"
)
## For one-way between subjects designs, partial eta squared is equivalent
##   to eta squared. Returning eta squared.